import requests
from lxml import etree
import pprint
from pymongo import MongoClient
import time

class BookInfo:
    """Scraper for http://books.toscrape.com/.

    Walks one category from the left-hand sidebar, visits every book's
    detail page, and stores name / price / description documents into
    the local MongoDB collection ``books.book_content``.
    """

    # books.toscrape.com shows 20 books per listing page.
    PAGE_SIZE = 20

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36'}
        self.base_url = 'http://books.toscrape.com/'
        # Number of listing pages in the currently selected category;
        # (re)computed by page_content().
        self.page_num = 1
        self.link_all = 'http://books.toscrape.com/catalogue/'

    @staticmethod
    def _page_count(total, page_size=20):
        """Return the number of listing pages needed for *total* results.

        Ceiling division, never less than 1.  Fixes the original
        off-by-one: ``1 + total // 20`` reported 3 pages for exactly
        40 results instead of 2.
        """
        return max(1, -(-int(total) // page_size))

    def left_all_url(self):
        """Fetch the home page and return the absolute URL of every
        category link in the left-hand sidebar."""
        response = requests.get(self.base_url, headers=self.headers, timeout=10)
        html = etree.HTML(response.text)
        link_left = html.xpath('//ul[@class="nav nav-list"]/li/ul/li/a/@href')
        # The sidebar hrefs are relative; prefix the site root.
        return [self.base_url + link for link in link_left]

    def page_content(self, link):
        """Inspect one category listing page.

        Sets ``self.page_num`` to the category's page count and returns
        either a ``.../page-%s.html`` URL template (multi-page category)
        or *link* unchanged (single page).
        """
        response = requests.get(link, headers=self.headers, timeout=10)
        html = etree.HTML(response.text)
        result = html.xpath('//div[@class="col-sm-8 col-md-9"]/form/strong[1]/text()')[0].strip()
        # Assign (do not accumulate) so the method is safe to call more
        # than once on the same BookInfo instance.
        self.page_num = self._page_count(result, self.PAGE_SIZE)
        if self.page_num > 1:
            print('该页面总共有%s页,共有%s条结果' % (self.page_num, result))
            now_url = '/'.join(link.split('/')[:-1]) + '/page-%s.html'
        else:
            print('该页面只有%s页，共有%s条结果' % (self.page_num, result))
            now_url = link
        return now_url

    def main(self, now_url):
        """Walk every listing page of the category, scrape each book's
        detail page, and persist the records one page at a time."""
        for page in range(1, self.page_num + 1):
            # now_url is a template only for multi-page categories;
            # an explicit test replaces the original bare except.
            page_url = now_url % page if '%s' in now_url else now_url
            listing_html = requests.get(page_url, headers=self.headers, timeout=10).text
            listing = etree.HTML(listing_html)
            # Detail-page hrefs are relative (../../<slug>/index.html);
            # keep the last two segments and prefix the catalogue root.
            detail_paths = listing.xpath('//div[@class="image_container"]/a/@href')
            detail_urls = [self.link_all + '/'.join(path.split('/')[-2:]) for path in detail_paths]
            records = []
            for detail_url in detail_urls:
                response = requests.get(detail_url, headers=self.headers, timeout=10)
                # The site serves UTF-8 without a charset header; without
                # this, requests guesses latin-1 and '£' becomes mojibake.
                response.encoding = 'utf-8'
                detail = etree.HTML(response.text)
                description = detail.xpath('//article[@class="product_page"]/p/text()')
                record = {
                    'books_name': detail.xpath('//div[@class="col-sm-6 product_main"]/h1/text()')[0],
                    # [1:] drops the leading currency sign from e.g. '£51.77'.
                    'money_price': detail.xpath('//div[@class="col-sm-6 product_main"]/p[1]/text()')[0][1:],
                    # Some books have no description; store '' instead of crashing.
                    'content': description[0] if description else '',
                }
                records.append(record)
            if records:  # insert_many raises InvalidOperation on an empty list
                self.connect_Mongodb(records)

    def connect_Mongodb(self, data):
        """Insert *data* (a list of dicts) into the ``book_content``
        collection of the local ``books`` MongoDB database."""
        client = MongoClient()
        try:
            client['books']['book_content'].insert_many(data)
        finally:
            # Release the connection pool; the original leaked a client
            # per batch.
            client.close()




if __name__ == '__main__':
    spider = BookInfo()
    category_urls = spider.left_all_url()
    # Pick any one category from the sidebar to scrape (index 1 here).
    listing_url = spider.page_content(category_urls[1])
    spider.main(listing_url)

